Define heatmap function
library(pheatmap)
create_gene_heatmap <- function(de_results_df, dge_object, font_size = 12, num_genes = 20) {
# Subset the genes
top_genes <- de_results_df$Symbol[1:num_genes]
subset_matrix <- dge_object$logCPM[top_genes, colnames(dge_object$counts)]
# Create annotation dataframe
sample_info <- dge_object$samples
annotation_df <- data.frame(
Donor = sample_info$Donor,
Subset = sample_info$Subset
)
rownames(annotation_df) <- colnames(subset_matrix)
# Define custom colors for annotations
annotation_colors <- list(
Subset = c(CD4 = "darkgreen", CD8 = "purple"), # Custom distinct colors for Subset
Donor = c(Donor_47 = "orange", Donor_64 = "blue") # Custom distinct colors for Donor
)
# Reorder gene expression matrix
ordered_samples <- order(sample_info$Subset, sample_info$Donor)
subset_matrix <- subset_matrix[, ordered_samples]
sample_info <- sample_info[ordered_samples, ]
# Define custom color scheme for blue-white-red
custom_colors <- colorRampPalette(c("blue", "white", "red"))(50)
# Create the heatmap
heatmap <- pheatmap(
subset_matrix,
scale = "row",
cluster_rows = TRUE,
cluster_cols = FALSE,
treeheight_row = 0,
color = custom_colors, # Use blue-white-red color scheme
show_colnames = FALSE,
annotation_col = annotation_df,
annotation_colors = annotation_colors, # Add custom annotation colors
fontsize = font_size,
silent = FALSE
)
}The aim of this notebook is to identify genes that differ between CD4 and CD8 subsets over time. This will be a basic DE analysis informed by the time course centric analysis conducted in notebook 3A splines timecourse.
I sequenced these samples with Prime-seq on extracted RNAs already. Results were not that great. Here reprocess the same samples with TIRE-seq for a head to head comparison.
The processing went as planned. Full writeup available at ELN link
This was generated in 2A_clustering
sce <- readRDS(here::here(
"data/TIRE_Tcell/SCEs/tcell_cluster.sce.rds"
))
# Reorder factor levels
sce$Timepoint <- factor(sce$Timepoint,
levels = c("Day_0", "Day_1", "Day_2",
"Day_5", "Day_6", "Day_8",
"Day_9", "Day_13", "Day_15"))
dge <- scran::convertTo(sce, type="edgeR")
tb <- as_tibble(dge$samples)Check on the perturbations conducted in this experiment:
Filter for genes that have at least 5 counts.
Currently keep 8,249
## Mode FALSE TRUE
## logical 12666 8249
Correct for composition biases by computing normalization factors with the trimmed mean of M-values method.
Combine time and subset into a new factor. For an interaction analysis it is easier to be more explicit than having fancy multiplication terms i.e timepoint * subset
The intercept term in a design matrix represents the baseline or reference level in your experimental design. Specifically:
dge$samples$Subset_Time <- paste(dge$samples$Subset, dge$samples$Timepoint, sep="-")
sm <- model.matrix(~0+Subset_Time + Donor, data=dge$samples)
# hypens not allowed
colnames(sm) <- make.names(colnames(sm), unique = FALSE, allow_ = TRUE)Convert the day from a factor to a numeric
sce$TimeDay <- str_split(sce$Timepoint, pattern = "_", simplify = T)[,2]
sce$TimeDay <- factor(sce$TimeDay, levels = sort(as.numeric(unique(sce$TimeDay))))interest <- c("IL2", "IFNG", "CDK1", "MKI67")
sce <- logNormCounts(sce)
plt0 <- plotExpression(sce, features = interest, x = "TimeDay", assay.type = "logcounts",
colour_by = "Subset", exprs_values = "counts", ncol=4) +
theme_Publication() + xlab("Day") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plt0Decide on the contrasts.
We focus on the early timepoints which is where all the changes take place.
contr.matrix <- makeContrasts(
Day0 = Subset_TimeCD8.Day_0 - Subset_TimeCD4.Day_0,
Day1 = Subset_TimeCD8.Day_1 - Subset_TimeCD4.Day_1,
Day2 = Subset_TimeCD8.Day_2 - Subset_TimeCD4.Day_2,
Day5 = Subset_TimeCD8.Day_5 - Subset_TimeCD4.Day_5,
Day15 = Subset_TimeCD8.Day_15 - Subset_TimeCD4.Day_15,
levels = colnames(sm))
contr.matrix %>%
kable()| Day0 | Day1 | Day2 | Day5 | Day15 | |
|---|---|---|---|---|---|
| Subset_TimeCD4.Day_0 | -1 | 0 | 0 | 0 | 0 |
| Subset_TimeCD4.Day_1 | 0 | -1 | 0 | 0 | 0 |
| Subset_TimeCD4.Day_13 | 0 | 0 | 0 | 0 | 0 |
| Subset_TimeCD4.Day_15 | 0 | 0 | 0 | 0 | -1 |
| Subset_TimeCD4.Day_2 | 0 | 0 | -1 | 0 | 0 |
| Subset_TimeCD4.Day_5 | 0 | 0 | 0 | -1 | 0 |
| Subset_TimeCD4.Day_6 | 0 | 0 | 0 | 0 | 0 |
| Subset_TimeCD4.Day_8 | 0 | 0 | 0 | 0 | 0 |
| Subset_TimeCD8.Day_0 | 1 | 0 | 0 | 0 | 0 |
| Subset_TimeCD8.Day_1 | 0 | 1 | 0 | 0 | 0 |
| Subset_TimeCD8.Day_13 | 0 | 0 | 0 | 0 | 0 |
| Subset_TimeCD8.Day_15 | 0 | 0 | 0 | 0 | 1 |
| Subset_TimeCD8.Day_2 | 0 | 0 | 1 | 0 | 0 |
| Subset_TimeCD8.Day_5 | 0 | 0 | 0 | 1 | 0 |
| Subset_TimeCD8.Day_6 | 0 | 0 | 0 | 0 | 0 |
| Subset_TimeCD8.Day_8 | 0 | 0 | 0 | 0 | 0 |
| DonorDonor_64 | 0 | 0 | 0 | 0 | 0 |
In each contrast, the format is A - B where:
So in this analysis the positive log fold changes will be CD8 cells.
Fit this model using limma/ voom.
par(mfrow=c(1,2))
v <- voom(dge, sm, plot=TRUE)
vfit <- lmFit(v, sm)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")CD8 is highest which is a good sanity check.
Day1_cd8.vs.cd4 <- topTable(efit, coef=2, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(Day1_cd8.vs.cd4)
results$ID <- rownames(Day1_cd8.vs.cd4)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "CD8"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "CD4"Add gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "CD8B"] <- "CD8B"
results$genelabels[results$Symbol == "CD8A"] <- "CD8A"
results$genelabels[results$Symbol == "XCL1"] <- "XCL1"
results$genelabels[results$Symbol == "FCGR3A"] <- "FCGR3A"
results$genelabels[results$Symbol == "CD4"] <- "CD4"
results$genelabels[results$Symbol == "TSHZ2"] <- "TSHZ2"
results$genelabels[results$Symbol == "FBLN7"] <- "FBLN7"results$DElabel <- factor(results$DElabel, levels = c("CD8", "n/s", "CD4")) # Adjust as per your actual labels
plt1 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) +
geom_point(alpha = 0.33, size = 1.5) +
geom_text_repel(size = 4, colour = "black") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
scale_color_manual(
values = c("darkorange", "grey", "darkblue"), # Colors in the desired order
name = "Upregulated",
labels = c("CD8", "n/s", "CD4") # Optional: Add custom labels
) +
geom_vline(xintercept = 1, linetype = "dotted") +
geom_vline(xintercept = -1, linetype = "dotted") +
theme_Publication()
plt1dge$logCPM <- cpm(dge, log=TRUE, prior.count=1)
dge_subset <- dge[,dge$samples$Timepoint == "Day_1"]
create_gene_heatmap(de_results_df = results, dge_object = dge_subset, font_size = 12, num_genes=15)Need to convert geneIDs from ensembl to enterez
geneids <- as.data.frame(v$genes$ID)
colnames(geneids) <- "ENSEMBL"
geneids$entrez <- mapIds(org.Hs.eg.db, keys = geneids$ENSEMBL, keytype = "ENSEMBL", column = "ENTREZID")
load("data/MSigDB/human_H_v5p2.rdata")
idx <- ids2indices(Hs.H,identifiers = geneids$entrez)Myc targets up in CD4 and complement up in CD8
Day 2 is where most of the changes take place and the T cells are taking off after being stimulated.
Day2_cd8.vs.cd4 <- topTable(efit, coef=3, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(Day2_cd8.vs.cd4)
results$ID <- rownames(Day2_cd8.vs.cd4)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "CD8"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "CD4"Add gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "CD8B"] <- "CD8B"
results$genelabels[results$Symbol == "CD8A"] <- "CD8A"
results$genelabels[results$Symbol == "CD8B2"] <- "CD8B2"
results$genelabels[results$Symbol == "XCL1"] <- "XCL1"
results$genelabels[results$Symbol == "CD4"] <- "CD4"
results$genelabels[results$Symbol == "TSHZ2"] <- "TSHZ2"
results$genelabels[results$Symbol == "FBLN7"] <- "FBLN7"
results$genelabels[results$Symbol == "CTSL"] <- "CTSL"# Reorder the levels of DElabel for proper legend ordering
results$DElabel <- factor(results$DElabel, levels = c("CD8", "n/s", "CD4")) # Ensure correct order
plt2 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) +
geom_point(alpha = 0.33, size = 1.5) +
geom_text_repel(size = 4, colour = "black") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
scale_color_manual(
values = c("darkorange", "grey", "darkblue"), # Colors in desired order
name = "Upregulated", # Legend title
labels = c("CD8", "n/s", "CD4") # Correct legend labels
) +
geom_vline(xintercept = 1, linetype = "dotted") +
geom_vline(xintercept = -1, linetype = "dotted") +
theme_Publication()
plt2Some chemokines here but nothing interesting
Day15_cd8.vs.cd4 <- topTable(efit, coef=5, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(Day15_cd8.vs.cd4)
results$ID <- rownames(Day15_cd8.vs.cd4)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "CD8"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "CD4"Add gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "CD8B"] <- "CD8B"
results$genelabels[results$Symbol == "CD8A"] <- "CD8A"
results$genelabels[results$Symbol == "CD8B2"] <- "CD8B2"
results$genelabels[results$Symbol == "ZNF683"] <- "ZNF683"
results$genelabels[results$Symbol == "CD4"] <- "CD4"
results$genelabels[results$Symbol == "CCL1"] <- "CCL1"
results$genelabels[results$Symbol == "GZMB"] <- "GZMB"
results$genelabels[results$Symbol == "XCL1"] <- "XCL1"results$DElabel <- factor(results$DElabel, levels = c("CD8", "n/s", "CD4")) # Ensure correct order
plt3 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) +
geom_point(alpha = 0.33, size = 1.5) +
geom_text_repel(size = 4, colour = "black") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
scale_color_manual(
values = c("darkorange", "grey", "darkblue"), # Colors in desired order
name = "Upregulated", # Legend title
labels = c("CD8", "n/s", "CD4") # Correct legend labels
) +
geom_vline(xintercept = 1, linetype = "dotted") +
geom_vline(xintercept = -1, linetype = "dotted") +
theme_Publication()
plt3Some chemokines here but nothing interesting
Day5_cd8.vs.cd4 <- topTable(efit, coef=4, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(Day5_cd8.vs.cd4)
results$ID <- rownames(Day5_cd8.vs.cd4)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "CD8"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "CD4"Add gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "CD8B"] <- "CD8B"
results$genelabels[results$Symbol == "CD8A"] <- "CD8A"
results$genelabels[results$Symbol == "LINC02446"] <- "LINC02446"
results$genelabels[results$Symbol == "KLRD1"] <- "KLRD1"
results$genelabels[results$Symbol == "CD4"] <- "CD4"
results$genelabels[results$Symbol == "CD40LG"] <- "CD40LG"
results$genelabels[results$Symbol == "TIMP1"] <- "TIMP1"
results$genelabels[results$Symbol == "FHIT"] <- "FHIT"plt4 <- ggplot(data=results, aes(x=logFC, y=-log10(adj.P.Val), colour=DElabel, label=genelabels)) +
geom_point(alpha=0.33, size=1.5) +
geom_text_repel(size=4, colour="black", max.overlaps = 30) +
guides(colour = guide_legend(override.aes = list(size=3, alpha=1))) +
scale_color_manual(values = c("darkblue", "darkorange", "grey"), name = "Upregulated") +
geom_vline(xintercept = 1, linetype="dotted") +
geom_vline(xintercept = -1, linetype="dotted") +
theme_Publication()
plt4Genes are all expected. Good to see CD4 and CD8 as most DE genes.
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Red Hat Enterprise Linux 9.4 (Plow)
##
## Matrix products: default
## BLAS: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRblas.so
## LAPACK: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRlapack.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid splines stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] ggthemes_5.1.0 here_1.0.1
## [3] knitr_1.48 patchwork_1.3.0
## [5] org.Hs.eg.db_3.19.1 AnnotationDbi_1.66.0
## [7] ggrepel_0.9.6 viridis_0.6.5
## [9] viridisLite_0.4.2 pheatmap_1.0.12
## [11] edgeR_4.2.2 limma_3.60.6
## [13] scater_1.32.1 scuttle_1.14.0
## [15] lubridate_1.9.3 forcats_1.0.0
## [17] stringr_1.5.1 dplyr_1.1.4
## [19] purrr_1.0.2 readr_2.1.5
## [21] tidyr_1.3.1 tibble_3.2.1
## [23] ggplot2_3.5.1 tidyverse_2.0.0
## [25] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [27] Biobase_2.64.0 GenomicRanges_1.56.2
## [29] GenomeInfoDb_1.40.1 IRanges_2.38.1
## [31] S4Vectors_0.42.1 BiocGenerics_0.50.0
## [33] MatrixGenerics_1.16.0 matrixStats_1.4.1
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 gridExtra_2.3
## [3] rlang_1.1.4 magrittr_2.0.3
## [5] compiler_4.4.1 RSQLite_2.3.7
## [7] DelayedMatrixStats_1.26.0 png_0.1-8
## [9] vctrs_0.6.5 pkgconfig_2.0.3
## [11] crayon_1.5.3 fastmap_1.2.0
## [13] XVector_0.44.0 labeling_0.4.3
## [15] utf8_1.2.4 rmarkdown_2.28
## [17] tzdb_0.4.0 UCSC.utils_1.0.0
## [19] ggbeeswarm_0.7.2 bit_4.5.0
## [21] bluster_1.14.0 xfun_0.48
## [23] zlibbioc_1.50.0 cachem_1.1.0
## [25] beachmat_2.20.0 jsonlite_1.8.9
## [27] blob_1.2.4 highr_0.11
## [29] DelayedArray_0.30.1 BiocParallel_1.38.0
## [31] cluster_2.1.6 irlba_2.3.5.1
## [33] parallel_4.4.1 R6_2.5.1
## [35] bslib_0.8.0 stringi_1.8.4
## [37] RColorBrewer_1.1-3 jquerylib_0.1.4
## [39] Rcpp_1.0.13 igraph_2.0.3
## [41] Matrix_1.7-0 timechange_0.3.0
## [43] tidyselect_1.2.1 rstudioapi_0.17.0
## [45] abind_1.4-8 yaml_2.3.10
## [47] codetools_0.2-20 lattice_0.22-6
## [49] KEGGREST_1.44.1 withr_3.0.1
## [51] evaluate_1.0.1 Biostrings_2.72.1
## [53] pillar_1.9.0 generics_0.1.3
## [55] rprojroot_2.0.4 hms_1.1.3
## [57] sparseMatrixStats_1.16.0 munsell_0.5.1
## [59] scales_1.3.0 glue_1.8.0
## [61] metapod_1.12.0 tools_4.4.1
## [63] BiocNeighbors_1.22.0 ScaledMatrix_1.12.0
## [65] locfit_1.5-9.10 scran_1.32.0
## [67] cowplot_1.1.3 colorspace_2.1-1
## [69] GenomeInfoDbData_1.2.12 beeswarm_0.4.0
## [71] BiocSingular_1.20.0 vipor_0.4.7
## [73] cli_3.6.3 rsvd_1.0.5
## [75] fansi_1.0.6 S4Arrays_1.4.1
## [77] gtable_0.3.5 sass_0.4.9
## [79] digest_0.6.37 dqrng_0.4.1
## [81] SparseArray_1.4.8 farver_2.1.2
## [83] memoise_2.0.1 htmltools_0.5.8.1
## [85] lifecycle_1.0.4 httr_1.4.7
## [87] statmod_1.5.0 bit64_4.5.2